library(packrat)
library(tidyverse)
library(magrittr)
library(dplyr)
library(here)
library(ggplot2)
library(wordcloud)
library(wordcloud2)
library(RColorBrewer)
library(tm)
library(tidytext)
library(mapproj)
library(corpus)
library(ggmap)
library(maps)
library(ggrepel)
library(viridis)
# Print the project root that `here` resolved for this session.
here::here()
## [1] "/Users/thiyangashaminitalagala/Lecturer/1_TEACHING/2020_s2/statisticalConsultancyUSJ"

# Read the raw CSV and keep it as a tibble.
df <- as_tibble(read.csv("data_original.csv"))

# Preview the first five rows.
head(df, 5)
# A tibble: 5 x 118
     ID Consultant DateRetrieved DatePublished Job_title Company     R   SAS
  <int> <chr>      <chr>         <chr>         <chr>     <chr>   <int> <int>
1     2 Jayani     44020         31/07/2020    Junior D… Dialog…     1     0
2    26 Jayani     13/08/2020    13/08/2020    Lead Dat… Snap I…     0     0
3    27 Jayani     13/08/2020    13/08/2020    Smart Ma… Micron…     0     0
4    28 Jayani     13/08/2020    13/08/2020    Data Sci… Not_de…     0     0
5    29 Jayani     13/08/2020    13/08/2020    Data Sci… PROCTE…     1     0
# … with 110 more variables: SPSS <int>, Python <int>, MAtlab <int>,
#   Scala <int>, C_sharp <int>, MS.Word <int>, Ms.Excel <int>, OLE_DB <int>,
#   Ms.Access <int>, Ms.PowerPoint <int>, Spreadsheets <int>,
#   Data_visualization <int>, Presentation_Skills <int>, Communication <int>,
#   BigData <int>, Data_warehouse <int>, cloud_storage <int>,
#   Google_Cloud <int>, AWS <int>, Machine_Learning <int>, Deep.Learning <int>,
#   Computer_vision <int>, Java <int>, C_plus_plus <int>, C <int>,
#   Linux_Unix <int>, SQL <int>, NoSQL <int>, RDBMS <int>, Oracle <int>,
#   MySQL <int>, PHP <int>, Flash_Actionscript <int>, SPL <int>,
#   web_design_and_development_tools <int>, Wordpress <int>, AI <int>,
#   Natural_Language_Processing.NLP. <int>, Microsoft.Power.BI <int>,
#   Google_Analytics <int>, graphics_and_design_skills <int>,
#   Data_marketing <int>, SEO <int>, Content_Management <int>, Tableau <int>,
#   D3 <int>, Alteryx <int>, KNIME <int>, Spotfire <int>, Spark <int>,
#   S3 <int>, Redshift <int>, DigitalOcean <int>, Javascript <int>,
#   Kafka <int>, Storm <int>, Bash <int>, Hadoop <int>, Data_Pipelines <int>,
#   MPP_Platforms <int>, Qlik <int>, Pig <int>, Hive <int>, Tensorflow <int>,
#   Map_Reduce <int>, Impala <int>, Solr <int>, Teradata <int>, MongoDB <int>,
#   Elasticsearch <int>, YOLO <int>, agile.execution <int>,
#   Data_management <int>, pyspark <int>, Data_mining <int>,
#   Data_science <int>, Web_Analytic_tools <int>, IOT <int>,
#   Numerical_Analysis <int>, Economic <int>, Finance_Knowledge <int>,
#   Investment_Knowledge <int>, Problem_Solving <int>, Korean_language <int>,
#   Bash.Linux.Scripting <int>, Knowledge_in <chr>, Experience <chr>,
#   City <chr>, Location <chr>, Educational_qualifications <chr>, Salary <chr>,
#   Team_Handling <int>, Debtor_reconcilation <int>, Payroll_management <int>,
#   Bayesian <int>, Optimization <int>, Bahasa.Malaysia <int>,
#   English.proficiency <chr>, URL <chr>, Search_Term <chr>, …
# Show the class vector of the loaded data.
class(df)
[1] "tbl_df"     "tbl"        "data.frame"
# Number of columns in the data.
ncol(df)
[1] 118
# List every column name.
colnames(df)
  [1] "ID"                                    
  [2] "Consultant"                            
  [3] "DateRetrieved"                         
  [4] "DatePublished"                         
  [5] "Job_title"                             
  [6] "Company"                               
  [7] "R"                                     
  [8] "SAS"                                   
  [9] "SPSS"                                  
 [10] "Python"                                
 [11] "MAtlab"                                
 [12] "Scala"                                 
 [13] "C_sharp"                               
 [14] "MS.Word"                               
 [15] "Ms.Excel"                              
 [16] "OLE_DB"                                
 [17] "Ms.Access"                             
 [18] "Ms.PowerPoint"                         
 [19] "Spreadsheets"                          
 [20] "Data_visualization"                    
 [21] "Presentation_Skills"                   
 [22] "Communication"                         
 [23] "BigData"                               
 [24] "Data_warehouse"                        
 [25] "cloud_storage"                         
 [26] "Google_Cloud"                          
 [27] "AWS"                                   
 [28] "Machine_Learning"                      
 [29] "Deep.Learning"                         
 [30] "Computer_vision"                       
 [31] "Java"                                  
 [32] "C_plus_plus"                           
 [33] "C"                                     
 [34] "Linux_Unix"                            
 [35] "SQL"                                   
 [36] "NoSQL"                                 
 [37] "RDBMS"                                 
 [38] "Oracle"                                
 [39] "MySQL"                                 
 [40] "PHP"                                   
 [41] "Flash_Actionscript"                    
 [42] "SPL"                                   
 [43] "web_design_and_development_tools"      
 [44] "Wordpress"                             
 [45] "AI"                                    
 [46] "Natural_Language_Processing.NLP."      
 [47] "Microsoft.Power.BI"                    
 [48] "Google_Analytics"                      
 [49] "graphics_and_design_skills"            
 [50] "Data_marketing"                        
 [51] "SEO"                                   
 [52] "Content_Management"                    
 [53] "Tableau"                               
 [54] "D3"                                    
 [55] "Alteryx"                               
 [56] "KNIME"                                 
 [57] "Spotfire"                              
 [58] "Spark"                                 
 [59] "S3"                                    
 [60] "Redshift"                              
 [61] "DigitalOcean"                          
 [62] "Javascript"                            
 [63] "Kafka"                                 
 [64] "Storm"                                 
 [65] "Bash"                                  
 [66] "Hadoop"                                
 [67] "Data_Pipelines"                        
 [68] "MPP_Platforms"                         
 [69] "Qlik"                                  
 [70] "Pig"                                   
 [71] "Hive"                                  
 [72] "Tensorflow"                            
 [73] "Map_Reduce"                            
 [74] "Impala"                                
 [75] "Solr"                                  
 [76] "Teradata"                              
 [77] "MongoDB"                               
 [78] "Elasticsearch"                         
 [79] "YOLO"                                  
 [80] "agile.execution"                       
 [81] "Data_management"                       
 [82] "pyspark"                               
 [83] "Data_mining"                           
 [84] "Data_science"                          
 [85] "Web_Analytic_tools"                    
 [86] "IOT"                                   
 [87] "Numerical_Analysis"                    
 [88] "Economic"                              
 [89] "Finance_Knowledge"                     
 [90] "Investment_Knowledge"                  
 [91] "Problem_Solving"                       
 [92] "Korean_language"                       
 [93] "Bash.Linux.Scripting"                  
 [94] "Knowledge_in"                          
 [95] "Experience"                            
 [96] "City"                                  
 [97] "Location"                              
 [98] "Educational_qualifications"            
 [99] "Salary"                                
[100] "Team_Handling"                         
[101] "Debtor_reconcilation"                  
[102] "Payroll_management"                    
[103] "Bayesian"                              
[104] "Optimization"                          
[105] "Bahasa.Malaysia"                       
[106] "English.proficiency"                   
[107] "URL"                                   
[108] "Search_Term"                           
[109] "Job_title_New"                         
[110] "Country"                               
[111] "Salary_Currency"                       
[112] "Min_Experience"                        
[113] "Statistical_software"                  
[114] "Programming_software"                  
[115] "database_softwares_and_query_languages"
[116] "Exp1"                                  
[117] "Location_New"                          
[118] "Min_Educational_qualifications"        
# Number of rows in the data.
nrow(df)
[1] 423
# Show the last twenty rows.
tail(df, 20)
# A tibble: 20 x 118
      ID Consultant DateRetrieved DatePublished Job_title Company     R   SAS
   <int> <chr>      <chr>         <chr>         <chr>     <chr>   <int> <int>
 1    86 Thimani    13/8/2020     21/7/2020     Data Sci… Affini…     1     1
 2    87 Thimani    13/8/2020     28/7/2020     Data Sci… Intern…     1     1
 3    60 Thimani    44051         30/7/2020     Data sci… Deutsc…     1     0
 4    61 Thimani    44051         43959         Data Sci… NICE A…     1     1
 5    83 Thimani    13/8/2020     25/7/2020     Data Sci… Figure…     0     0
 6    84 Thimani    13/8/2020     44172         Data Sci… Predic…     0     0
 7    20 Jayani     13/08/2020    13/08/2020    Data Ana… Ernst …     0     0
 8    21 Jayani     13/08/2020    44173         Data Ana… Fitch …     0     0
 9   161 Piyumika   44082         25/7/2020     Data Ana… E.D.Bu…     0     1
10   162 Piyumika   44082         25/7/2020     Senior D… E.D.Bu…     0     1
11   208 Nimesha    44082         44051         Junior D… Beer52      0     0
12   226 Nimesha    44173         44173         Junior D… Xcede       1     0
13   227 Nimesha    44173         44143         Data Ana… OneMag…     0     0
14   318 Rajitha    44113         43897         Data Sci… Facebo…     1     1
15   320 Rajitha    44113         43897         Data Sci… Apple …     1     0
16   319 Rajitha    44113         44111         Machine … Deloit…     0     0
17   169 Piyumika   44143         44032         Data Sci… Ness T…     0     0
18   170 Piyumika   44143         43971         Full Sta… Ness T…     0     0
19   336 Rajitha    15/09/2020    44112         Data Sci… Eighte…     0     0
20   262 Sanduni    44112         43929         Lead Dat… Target      1     0
# … with 110 more variables: SPSS <int>, Python <int>, MAtlab <int>,
#   Scala <int>, C_sharp <int>, MS.Word <int>, Ms.Excel <int>, OLE_DB <int>,
#   Ms.Access <int>, Ms.PowerPoint <int>, Spreadsheets <int>,
#   Data_visualization <int>, Presentation_Skills <int>, Communication <int>,
#   BigData <int>, Data_warehouse <int>, cloud_storage <int>,
#   Google_Cloud <int>, AWS <int>, Machine_Learning <int>, Deep.Learning <int>,
#   Computer_vision <int>, Java <int>, C_plus_plus <int>, C <int>,
#   Linux_Unix <int>, SQL <int>, NoSQL <int>, RDBMS <int>, Oracle <int>,
#   MySQL <int>, PHP <int>, Flash_Actionscript <int>, SPL <int>,
#   web_design_and_development_tools <int>, Wordpress <int>, AI <int>,
#   Natural_Language_Processing.NLP. <int>, Microsoft.Power.BI <int>,
#   Google_Analytics <int>, graphics_and_design_skills <int>,
#   Data_marketing <int>, SEO <int>, Content_Management <int>, Tableau <int>,
#   D3 <int>, Alteryx <int>, KNIME <int>, Spotfire <int>, Spark <int>,
#   S3 <int>, Redshift <int>, DigitalOcean <int>, Javascript <int>,
#   Kafka <int>, Storm <int>, Bash <int>, Hadoop <int>, Data_Pipelines <int>,
#   MPP_Platforms <int>, Qlik <int>, Pig <int>, Hive <int>, Tensorflow <int>,
#   Map_Reduce <int>, Impala <int>, Solr <int>, Teradata <int>, MongoDB <int>,
#   Elasticsearch <int>, YOLO <int>, agile.execution <int>,
#   Data_management <int>, pyspark <int>, Data_mining <int>,
#   Data_science <int>, Web_Analytic_tools <int>, IOT <int>,
#   Numerical_Analysis <int>, Economic <int>, Finance_Knowledge <int>,
#   Investment_Knowledge <int>, Problem_Solving <int>, Korean_language <int>,
#   Bash.Linux.Scripting <int>, Knowledge_in <chr>, Experience <chr>,
#   City <chr>, Location <chr>, Educational_qualifications <chr>, Salary <chr>,
#   Team_Handling <int>, Debtor_reconcilation <int>, Payroll_management <int>,
#   Bayesian <int>, Optimization <int>, Bahasa.Malaysia <int>,
#   English.proficiency <chr>, URL <chr>, Search_Term <chr>, …
# Frequency table of the Job_title_New column, sorted from most to least
# common, with each title's share of the tabulated total as a percentage.
d_1211 <- df$Job_title_New %>%
  table() %>%
  as.data.frame()
names(d_1211) <- c("Job_title", "Frequency")
# Spell out TRUE: `T` is an ordinary variable that can be reassigned.
d_1211 <- d_1211[order(d_1211$Frequency, decreasing = TRUE), ]

d_1211 <- d_1211 %>%
  mutate(Percentage = Frequency * 100 / sum(Frequency))
d_1211
##   Job_title Frequency Percentage
## 1        DS       211  49.881797
## 2        DA       119  28.132388
## 3     Other        31   7.328605
## 4        DE        20   4.728132
## 5   Analyst        14   3.309693
## 6 Statician         9   2.127660
## 7 Actuarial         8   1.891253
## 8        SP         6   1.418440
## 9 Scientist         5   1.182033
# Horizontal bar chart of the counts; reorder() sorts the bars by frequency.
ggplot(d_1211, aes(x = reorder(Job_title, Frequency), y = Frequency)) +
  labs(y = "Count of Jobs", x = "Job Title") +
  geom_bar(stat = "identity", width = 0.5, fill = "#FC4E07") +
  ggtitle("Bar chart of Job Title") +
  coord_flip()

# Frequency table of the Statistical_software column, sorted from most to
# least common, with each level's share of the tabulated total as a percentage.
d_1211 <- df$Statistical_software %>%
  table() %>%
  as.data.frame()
names(d_1211) <- c("Statistical_software", "Frequency")
# Spell out TRUE: `T` is an ordinary variable that can be reassigned.
d_1211 <- d_1211[order(d_1211$Frequency, decreasing = TRUE), ]

d_1211 <- d_1211 %>%
  mutate(Percentage = Frequency * 100 / sum(Frequency))
d_1211
##       Statistical_software Frequency Percentage
## 1               R & Python       124  29.314421
## 2 No Statistical softwares        94  22.222222
## 3                    Other        91  21.513002
## 4                   Python        88  20.803783
## 5          R, SAS & Python        26   6.146572
# Horizontal bar chart of the counts; reorder() sorts the bars by frequency.
ggplot(
  d_1211,
  aes(x = reorder(Statistical_software, Frequency), y = Frequency)
) +
  labs(y = "Count of Jobs", x = "Statistical software") +
  geom_bar(stat = "identity", width = 0.5, fill = "#CC79A7") +
  ggtitle("Bar chart of Statistical software") +
  coord_flip()

# Frequency table of the Programming_software column, sorted from most to
# least common, with each level's share of the tabulated total as a percentage.
d_1211 <- df$Programming_software %>%
  table() %>%
  as.data.frame()
names(d_1211) <- c("Programming_software", "Frequency")
# Spell out TRUE: `T` is an ordinary variable that can be reassigned.
d_1211 <- d_1211[order(d_1211$Frequency, decreasing = TRUE), ]

d_1211 <- d_1211 %>%
  mutate(Percentage = Frequency * 100 / sum(Frequency))
# NOTE(review): the level `1::0::1::0::0::1` in the output below looks like an
# uncleaned value rather than a real category -- confirm against the raw data.
d_1211
##      Programming_software Frequency Percentage
## 1 No programming software       349 82.5059102
## 2                   Other        43 10.1654846
## 3               Java only        15  3.5460993
## 4              Spark only        15  3.5460993
## 5        1::0::1::0::0::1         1  0.2364066
# Horizontal bar chart of the counts; reorder() sorts the bars by frequency.
ggplot(
  d_1211,
  aes(x = reorder(Programming_software, Frequency), y = Frequency)
) +
  labs(y = "Count of Jobs", x = "Programming software") +
  geom_bar(stat = "identity", width = 0.5, fill = "#009E73") +
  ggtitle("Bar chart of Programming software") +
  coord_flip()

# Frequency table of the database_softwares_and_query_languages column, sorted
# from most to least common, with each level's share of the tabulated total as
# a percentage.
d_1211 <- df$database_softwares_and_query_languages %>%
  table() %>%
  as.data.frame()
names(d_1211) <- c("database_softwares_and_query_languages", "Frequency")
# Spell out TRUE: `T` is an ordinary variable that can be reassigned.
d_1211 <- d_1211[order(d_1211$Frequency, decreasing = TRUE), ]

d_1211 <- d_1211 %>%
  mutate(Percentage = Frequency * 100 / sum(Frequency))
d_1211
##       database_softwares_and_query_languages Frequency Percentage
## 1 No database software and/or query language       169  39.952719
## 2                                   SQL only       167  39.479905
## 3                                      Other        35   8.274232
## 4                           SQL & MySQL only        27   6.382979
## 5                         SQL & Handoop only        13   3.073286
## 6                           SQL & NoSQL only        12   2.836879
# Horizontal bar chart of the counts; reorder() sorts the bars by frequency.
ggplot(
  d_1211,
  aes(
    x = reorder(database_softwares_and_query_languages, Frequency),
    y = Frequency
  )
) +
  labs(y = "Count of Jobs", x = "database softwares and query languages") +
  geom_bar(stat = "identity", width = 0.5, fill = "#56B4E9") +
  ggtitle("database softwares and query languages") +
  coord_flip()

# Cross-tabulate job title against minimum experience (Exp1) and draw the
# counts as stacked horizontal bars, one bar per job title.
df$Exp1 <- as.factor(df$Exp1)
d12 <- as.data.frame(table(df$Job_title_New, df$Exp1))
names(d12) <- c("Job_title", "Min_Experience", "No_of_cases")
# NOTE(review): label_ypos is a running total over every row of the table (it
# is not reset per job title) and the plot below never reads it -- confirm
# whether it is still needed.
d12 <- d12 %>%
  mutate(label_ypos = cumsum(No_of_cases) - 0.5 * No_of_cases)
# Spell out TRUE: `T` is an ordinary variable that can be reassigned.
df13 <- d12[order(d12$No_of_cases, decreasing = TRUE), ]

df14 <- df13

ggplot(df14, aes(y = No_of_cases, x = Job_title, fill = Min_Experience)) +
  labs(y = "Number of Jobs", x = "Job_title") +
  geom_bar(stat = "identity", width = 0.5) +
  ggtitle("Bar chart of people by Minimum Experience and Job title") +
  coord_flip()

# Minimum-experience bar charts, one per job title. Every chart repeats the
# same tabulate / sort / plot steps, so those steps live in two helpers.

# Count how often each Exp1 value occurs in `exp_subset` (a data frame with an
# Exp1 column) and return the counts sorted from most to least frequent.
count_min_experience <- function(exp_subset) {
  counts <- as.data.frame(table(exp_subset$Exp1))
  names(counts) <- c("Min_Experience", "Frequency")
  # TRUE rather than `T`, which is an ordinary reassignable variable.
  counts[order(counts$Frequency, decreasing = TRUE), ]
}

# Horizontal bar chart of the counts returned by count_min_experience().
# `job_title` is inserted into the chart title; `fill` is the bar colour.
plot_min_experience <- function(counts, job_title, fill) {
  ggplot(counts, aes(x = reorder(Min_Experience, Frequency), y = Frequency)) +
    labs(y = "Number of Jobs", x = "Min_Experience") +
    geom_bar(stat = "identity", width = 0.5, fill = fill) +
    ggtitle(paste("Bar chart of", job_title, "people by Experience")) +
    coord_flip()
}

dd1 <- df %>%
  filter(Job_title_New == "DS") %>%
  select(Exp1)
d12 <- count_min_experience(dd1)
plot_min_experience(d12, "DS", fill = "#CC79A7")

dd1 <- df %>%
  filter(Job_title_New == "DA") %>%
  select(Exp1)
d12 <- count_min_experience(dd1)
plot_min_experience(d12, "DA", fill = "#56B4E9")

dd1 <- df %>%
  filter(Job_title_New == "DE") %>%
  select(Exp1)
d12 <- count_min_experience(dd1)
plot_min_experience(d12, "DE", fill = "#E69F00")

dd1 <- df %>%
  filter(Job_title_New == "Analyst") %>%
  select(Exp1)
d12 <- count_min_experience(dd1)
plot_min_experience(d12, "Analyst", fill = "#999999")

# "Statician" is spelled the way the value appears in Job_title_New.
dd1 <- df %>%
  filter(Job_title_New == "Statician") %>%
  select(Exp1)
d12 <- count_min_experience(dd1)
plot_min_experience(d12, "Statician", fill = "#56B4E9")

dd1 <- df %>%
  filter(Job_title_New == "Actuarial") %>%
  select(Exp1)
d12 <- count_min_experience(dd1)
plot_min_experience(d12, "Actuarial", fill = "black")

dd1 <- df %>%
  filter(Job_title_New == "SP") %>%
  select(Exp1)
d12 <- count_min_experience(dd1)
plot_min_experience(d12, "SP", fill = "#E7B800")

dd1 <- df %>%
  filter(Job_title_New == "Scientist") %>%
  select(Exp1)
d12 <- count_min_experience(dd1)
plot_min_experience(d12, "Scientist", fill = "#00AFBB")

# Cross-tabulate job title against Statistical_software and draw the counts as
# stacked horizontal bars, one bar per job title.
d12 <- as.data.frame(table(df$Job_title_New, df$Statistical_software))
names(d12) <- c("Job_title", "Statistical_software", "No_of_cases")
# NOTE(review): label_ypos is a running total over every row of the table (it
# is not reset per job title) and the plot below never reads it -- confirm
# whether it is still needed.
d12 <- d12 %>%
  mutate(label_ypos = cumsum(No_of_cases) - 0.5 * No_of_cases)
# Spell out TRUE: `T` is an ordinary variable that can be reassigned.
df13 <- d12[order(d12$No_of_cases, decreasing = TRUE), ]

df14 <- df13

ggplot(
  df14,
  aes(y = No_of_cases, x = Job_title, fill = Statistical_software)
) +
  labs(y = "Number of Jobs", x = "Job_title") +
  geom_bar(stat = "identity", width = 0.5) +
  ggtitle("Bar chart of people by Statistical_software and Job title") +
  coord_flip()

# Statistical-software bar charts, one per job title. Every chart repeats the
# same tabulate / sort / plot steps, so those steps live in two helpers.
# NOTE(review): no chart is drawn here for the "DS" title -- confirm that this
# is intentional.

# Count how often each Statistical_software value occurs in `software_subset`
# (a data frame with that column), sorted from most to least frequent.
count_statistical_software <- function(software_subset) {
  counts <- as.data.frame(table(software_subset$Statistical_software))
  names(counts) <- c("Statistical_software", "Frequency")
  # TRUE rather than `T`, which is an ordinary reassignable variable.
  counts[order(counts$Frequency, decreasing = TRUE), ]
}

# Horizontal bar chart of the counts from count_statistical_software().
# `job_title` is inserted into the chart title; `fill` is the bar colour.
plot_statistical_software <- function(counts, job_title, fill) {
  ggplot(
    counts,
    aes(x = reorder(Statistical_software, Frequency), y = Frequency)
  ) +
    labs(y = "Number of Jobs", x = "Statistical_software") +
    geom_bar(stat = "identity", width = 0.5, fill = fill) +
    ggtitle(
      paste("Bar chart of", job_title, "people by Statistical software")
    ) +
    coord_flip()
}

dd1 <- df %>%
  filter(Job_title_New == "DA") %>%
  select(Statistical_software)
d12 <- count_statistical_software(dd1)
plot_statistical_software(d12, "DA", fill = "#56B4E9")

dd1 <- df %>%
  filter(Job_title_New == "DE") %>%
  select(Statistical_software)
d12 <- count_statistical_software(dd1)
plot_statistical_software(d12, "DE", fill = "#E69F00")

dd1 <- df %>%
  filter(Job_title_New == "Analyst") %>%
  select(Statistical_software)
d12 <- count_statistical_software(dd1)
plot_statistical_software(d12, "Analyst", fill = "#999999")

# "Statician" is spelled the way the value appears in Job_title_New.
dd1 <- df %>%
  filter(Job_title_New == "Statician") %>%
  select(Statistical_software)
d12 <- count_statistical_software(dd1)
plot_statistical_software(d12, "Statician", fill = "#56B4E9")

dd1 <- df %>%
  filter(Job_title_New == "Actuarial") %>%
  select(Statistical_software)
d12 <- count_statistical_software(dd1)
plot_statistical_software(d12, "Actuarial", fill = "black")

dd1 <- df %>%
  filter(Job_title_New == "SP") %>%
  select(Statistical_software)
d12 <- count_statistical_software(dd1)
plot_statistical_software(d12, "SP", fill = "#E7B800")

dd1 <- df %>%
  filter(Job_title_New == "Scientist") %>%
  select(Statistical_software)
d12 <- count_statistical_software(dd1)
plot_statistical_software(d12, "Scientist", fill = "#00AFBB")

# Cross-tabulate job title against Programming_software and draw the counts as
# stacked horizontal bars, one bar per job title.
d12 <- as.data.frame(table(df$Job_title_New, df$Programming_software))
names(d12) <- c("Job_title", "Programming_software", "No_of_cases")
# NOTE(review): label_ypos is a running total over every row of the table (it
# is not reset per job title) and the plot below never reads it -- confirm
# whether it is still needed.
d12 <- d12 %>%
  mutate(label_ypos = cumsum(No_of_cases) - 0.5 * No_of_cases)
# Spell out TRUE: `T` is an ordinary variable that can be reassigned.
df13 <- d12[order(d12$No_of_cases, decreasing = TRUE), ]

df14 <- df13

ggplot(
  df14,
  aes(y = No_of_cases, x = Job_title, fill = Programming_software)
) +
  labs(y = "Number of Jobs", x = "Job_title") +
  geom_bar(stat = "identity", width = 0.5) +
  ggtitle("Bar chart of people by Programming_software and Job title") +
  coord_flip()

# Programming-software bar charts, one per job title. Every chart repeats the
# same tabulate / sort / plot steps, so those steps live in two helpers.
# NOTE(review): no chart is drawn here for the "DS" title -- confirm that this
# is intentional.

# Count how often each Programming_software value occurs in `software_subset`
# (a data frame with that column), sorted from most to least frequent.
count_programming_software <- function(software_subset) {
  counts <- as.data.frame(table(software_subset$Programming_software))
  names(counts) <- c("Programming_software", "Frequency")
  # TRUE rather than `T`, which is an ordinary reassignable variable.
  counts[order(counts$Frequency, decreasing = TRUE), ]
}

# Horizontal bar chart of the counts from count_programming_software().
# `job_title` is inserted into the chart title; `fill` is the bar colour.
plot_programming_software <- function(counts, job_title, fill) {
  ggplot(
    counts,
    aes(x = reorder(Programming_software, Frequency), y = Frequency)
  ) +
    labs(y = "Number of Jobs", x = "Programming_software") +
    geom_bar(stat = "identity", width = 0.5, fill = fill) +
    ggtitle(
      paste("Bar chart of", job_title, "people by Programming software")
    ) +
    coord_flip()
}

dd1 <- df %>%
  filter(Job_title_New == "DA") %>%
  select(Programming_software)
d12 <- count_programming_software(dd1)
plot_programming_software(d12, "DA", fill = "#56B4E9")

dd1 <- df %>%
  filter(Job_title_New == "DE") %>%
  select(Programming_software)
d12 <- count_programming_software(dd1)
plot_programming_software(d12, "DE", fill = "#E69F00")

dd1 <- df %>%
  filter(Job_title_New == "Analyst") %>%
  select(Programming_software)
d12 <- count_programming_software(dd1)
plot_programming_software(d12, "Analyst", fill = "#999999")

# "Statician" is spelled the way the value appears in Job_title_New.
dd1 <- df %>%
  filter(Job_title_New == "Statician") %>%
  select(Programming_software)
d12 <- count_programming_software(dd1)
plot_programming_software(d12, "Statician", fill = "#56B4E9")

dd1 <- df %>%
  filter(Job_title_New == "Actuarial") %>%
  select(Programming_software)
d12 <- count_programming_software(dd1)
plot_programming_software(d12, "Actuarial", fill = "black")

dd1 <- df %>%
  filter(Job_title_New == "SP") %>%
  select(Programming_software)
d12 <- count_programming_software(dd1)
plot_programming_software(d12, "SP", fill = "#E7B800")

dd1 <- df %>%
  filter(Job_title_New == "Scientist") %>%
  select(Programming_software)
d12 <- count_programming_software(dd1)
plot_programming_software(d12, "Scientist", fill = "#00AFBB")

# Cross-tabulate job title against database_softwares_and_query_languages and
# draw the counts as stacked horizontal bars, one bar per job title.
d12 <- as.data.frame(
  table(df$Job_title_New, df$database_softwares_and_query_languages)
)
names(d12) <- c(
  "Job_title",
  "database_softwares_and_query_languages",
  "No_of_cases"
)
# NOTE(review): label_ypos is a running total over every row of the table (it
# is not reset per job title) and the plot below never reads it -- confirm
# whether it is still needed.
d12 <- d12 %>%
  mutate(label_ypos = cumsum(No_of_cases) - 0.5 * No_of_cases)
# Spell out TRUE: `T` is an ordinary variable that can be reassigned.
df13 <- d12[order(d12$No_of_cases, decreasing = TRUE), ]

df14 <- df13

ggplot(
  df14,
  aes(
    y = No_of_cases,
    x = Job_title,
    fill = database_softwares_and_query_languages
  )
) +
  labs(y = "Number of Jobs", x = "Job_title") +
  geom_bar(stat = "identity", width = 0.5) +
  ggtitle(
    "Bar chart of people by database_softwares_and_query_languages and Job title"
  ) +
  coord_flip()

dd1 <- df %>% filter(Job_title_New =="DA") %>% select(database_softwares_and_query_languages)
d12 <- as.data.frame(table(dd1$database_softwares_and_query_languages))
names(d12) <- c('database_softwares_and_query_languages', 'Frequency')
d12 <- d12[order(d12$Frequency, decreasing = T),]

ggplot(d12, aes(x= reorder(database_softwares_and_query_languages, Frequency), y=Frequency))+ labs(y="Number of Jobs", x="database_softwares_and_query_languages") + geom_bar(stat = "identity", width = 0.5,fill="#56B4E9")+ ggtitle("Bar chart of DA people by database_softwares_and_query_languages") + coord_flip()

dd1 <- df %>% filter(Job_title_New =="DE") %>% select(database_softwares_and_query_languages)
d12 <- as.data.frame(table(dd1$database_softwares_and_query_languages))
names(d12) <- c('database_softwares_and_query_languages', 'Frequency')
d12 <- d12[order(d12$Frequency, decreasing = T),]

ggplot(d12, aes(x= reorder(database_softwares_and_query_languages, Frequency), y=Frequency))+ labs(y="Number of Jobs", x="database_softwares_and_query_languages") + geom_bar(stat = "identity", width = 0.5,fill="#E69F00")+ ggtitle("Bar chart of DE people by database_softwares_and_query_languages") + coord_flip()

dd1 <- df %>% filter(Job_title_New =="Analyst") %>% select(database_softwares_and_query_languages)
d12 <- as.data.frame(table(dd1$database_softwares_and_query_languages))
names(d12) <- c('database_softwares_and_query_languages', 'Frequency')
d12 <- d12[order(d12$Frequency, decreasing = T),]

ggplot(d12, aes(x= reorder(database_softwares_and_query_languages, Frequency), y=Frequency))+ labs(y="Number of Jobs", x="database_softwares_and_query_languages") + geom_bar(stat = "identity", width = 0.5,fill="#999999")+ ggtitle("Bar chart of Analyst people by database_softwares_and_query_languages") + coord_flip()

dd1 <- df %>% filter(Job_title_New =="Statician") %>% select(database_softwares_and_query_languages)
d12 <- as.data.frame(table(dd1$database_softwares_and_query_languages))
names(d12) <- c('database_softwares_and_query_languages', 'Frequency')
d12 <- d12[order(d12$Frequency, decreasing = T),]

ggplot(d12, aes(x= reorder(database_softwares_and_query_languages, Frequency), y=Frequency))+ labs(y="Number of Jobs", x="database_softwares_and_query_languages") + geom_bar(stat = "identity", width = 0.5,fill="#56B4E9")+ ggtitle("Bar chart of Statician people by database_softwares_and_query_languages") + coord_flip()

dd1 <- df %>% filter(Job_title_New =="Actuarial") %>% select(database_softwares_and_query_languages)
d12 <- as.data.frame(table(dd1$database_softwares_and_query_languages))
names(d12) <- c('database_softwares_and_query_languages', 'Frequency')
d12 <- d12[order(d12$Frequency, decreasing = T),]

ggplot(d12, aes(x= reorder(database_softwares_and_query_languages, Frequency), y=Frequency))+ labs(y="Number of Jobs", x="database_softwares_and_query_languages") + geom_bar(stat = "identity", width = 0.5,fill="black")+ ggtitle("Bar chart of Actuarial people by database_softwares_and_query_languages") + coord_flip()

dd1 <- df %>% filter(Job_title_New =="SP") %>% select(database_softwares_and_query_languages)
d12 <- as.data.frame(table(dd1$database_softwares_and_query_languages))
names(d12) <- c('database_softwares_and_query_languages', 'Frequency')
d12 <- d12[order(d12$Frequency, decreasing = T),]

ggplot(d12, aes(x= reorder(database_softwares_and_query_languages, Frequency), y=Frequency))+ labs(y="Number of Jobs", x="database_softwares_and_query_languages") + geom_bar(stat = "identity", width = 0.5,fill="#E7B800")+ ggtitle("Bar chart of SP people by database_softwares_and_query_languages") + coord_flip()

# Frequency of each database_softwares_and_query_languages value among the
# rows of df whose Job_title_New is "Scientist".
dd1 <- df %>%
  filter(Job_title_New == "Scientist") %>%
  select(database_softwares_and_query_languages)
d12 <- setNames(
  as.data.frame(table(dd1$database_softwares_and_query_languages)),
  c("database_softwares_and_query_languages", "Frequency")
)
# Use the reserved constant TRUE; `T` is an ordinary variable that can be
# reassigned elsewhere in the session.
d12 <- d12[order(d12$Frequency, decreasing = TRUE), ]

# Bars are ordered by reorder() inside aes(); coord_flip() makes them horizontal.
ggplot(
  d12,
  aes(
    x = reorder(database_softwares_and_query_languages, Frequency),
    y = Frequency
  )
) +
  geom_col(width = 0.5, fill = "#00AFBB") +
  labs(
    x = "database_softwares_and_query_languages",
    y = "Number of Jobs",
    title = "Bar chart of Scientist people by database_softwares_and_query_languages"
  ) +
  coord_flip()

# Cross-tabulate Job_title_New against Min_Educational_qualifications and draw
# one stacked horizontal bar per job title.
d12 <- as.data.frame(
  table(df$Job_title_New, df$Min_Educational_qualifications)
)
names(d12) <- c("Job_title", "Min_Educational_qualifications", "No_of_cases")
# NOTE(review): label_ypos is a running total over the whole table (not reset
# per job title) and the plot below never reads it -- confirm it is still needed.
d12 <- d12 %>%
  mutate(label_ypos = cumsum(No_of_cases) - 0.5 * No_of_cases)
# Use the reserved constant TRUE; `T` is an ordinary variable that can be
# reassigned elsewhere in the session.
df13 <- d12[order(d12$No_of_cases, decreasing = TRUE), ]

df14 <- df13

ggplot(
  df14,
  aes(x = Job_title, y = No_of_cases, fill = Min_Educational_qualifications)
) +
  geom_col(width = 0.5) +
  labs(
    x = "Job_title",
    y = "Number of Jobs",
    title = "Bar chart of people by Min_Educational_qualifications and Job title"
  ) +
  coord_flip()

# Frequency of each Min_Educational_qualifications value among the rows of df
# whose Job_title_New is "DA".
dd1 <- df %>%
  filter(Job_title_New == "DA") %>%
  select(Min_Educational_qualifications)
d12 <- setNames(
  as.data.frame(table(dd1$Min_Educational_qualifications)),
  c("Min_Educational_qualifications", "Frequency")
)
# Use the reserved constant TRUE; `T` is an ordinary variable that can be
# reassigned elsewhere in the session.
d12 <- d12[order(d12$Frequency, decreasing = TRUE), ]

# Bars are ordered by reorder() inside aes(); coord_flip() makes them horizontal.
ggplot(
  d12,
  aes(x = reorder(Min_Educational_qualifications, Frequency), y = Frequency)
) +
  geom_col(width = 0.5, fill = "#56B4E9") +
  labs(
    x = "Min_Educational_qualifications",
    y = "Number of Jobs",
    title = "Bar chart of DA people by Min_Educational_qualifications"
  ) +
  coord_flip()

# Frequency of each Min_Educational_qualifications value among the rows of df
# whose Job_title_New is "DE".
dd1 <- df %>%
  filter(Job_title_New == "DE") %>%
  select(Min_Educational_qualifications)
d12 <- setNames(
  as.data.frame(table(dd1$Min_Educational_qualifications)),
  c("Min_Educational_qualifications", "Frequency")
)
# Use the reserved constant TRUE; `T` is an ordinary variable that can be
# reassigned elsewhere in the session.
d12 <- d12[order(d12$Frequency, decreasing = TRUE), ]

# Bars are ordered by reorder() inside aes(); coord_flip() makes them horizontal.
ggplot(
  d12,
  aes(x = reorder(Min_Educational_qualifications, Frequency), y = Frequency)
) +
  geom_col(width = 0.5, fill = "#E69F00") +
  labs(
    x = "Min_Educational_qualifications",
    y = "Number of Jobs",
    title = "Bar chart of DE people by Min_Educational_qualifications"
  ) +
  coord_flip()

# Frequency of each Min_Educational_qualifications value among the rows of df
# whose Job_title_New is "Analyst".
dd1 <- df %>%
  filter(Job_title_New == "Analyst") %>%
  select(Min_Educational_qualifications)
d12 <- setNames(
  as.data.frame(table(dd1$Min_Educational_qualifications)),
  c("Min_Educational_qualifications", "Frequency")
)
# Use the reserved constant TRUE; `T` is an ordinary variable that can be
# reassigned elsewhere in the session.
d12 <- d12[order(d12$Frequency, decreasing = TRUE), ]

# Bars are ordered by reorder() inside aes(); coord_flip() makes them horizontal.
ggplot(
  d12,
  aes(x = reorder(Min_Educational_qualifications, Frequency), y = Frequency)
) +
  geom_col(width = 0.5, fill = "#999999") +
  labs(
    x = "Min_Educational_qualifications",
    y = "Number of Jobs",
    title = "Bar chart of Analyst people by Min_Educational_qualifications"
  ) +
  coord_flip()

# Frequency of each Min_Educational_qualifications value among the rows of df
# whose Job_title_New is "Statician".
dd1 <- df %>%
  filter(Job_title_New == "Statician") %>%
  select(Min_Educational_qualifications)
d12 <- setNames(
  as.data.frame(table(dd1$Min_Educational_qualifications)),
  c("Min_Educational_qualifications", "Frequency")
)
# Use the reserved constant TRUE; `T` is an ordinary variable that can be
# reassigned elsewhere in the session.
d12 <- d12[order(d12$Frequency, decreasing = TRUE), ]

# Bars are ordered by reorder() inside aes(); coord_flip() makes them horizontal.
ggplot(
  d12,
  aes(x = reorder(Min_Educational_qualifications, Frequency), y = Frequency)
) +
  geom_col(width = 0.5, fill = "#56B4E9") +
  labs(
    x = "Min_Educational_qualifications",
    y = "Number of Jobs",
    title = "Bar chart of Statician people by Min_Educational_qualifications"
  ) +
  coord_flip()

# Frequency of each Min_Educational_qualifications value among the rows of df
# whose Job_title_New is "Actuarial".
dd1 <- df %>%
  filter(Job_title_New == "Actuarial") %>%
  select(Min_Educational_qualifications)
d12 <- setNames(
  as.data.frame(table(dd1$Min_Educational_qualifications)),
  c("Min_Educational_qualifications", "Frequency")
)
# Use the reserved constant TRUE; `T` is an ordinary variable that can be
# reassigned elsewhere in the session.
d12 <- d12[order(d12$Frequency, decreasing = TRUE), ]

# Bars are ordered by reorder() inside aes(); coord_flip() makes them horizontal.
ggplot(
  d12,
  aes(x = reorder(Min_Educational_qualifications, Frequency), y = Frequency)
) +
  geom_col(width = 0.5, fill = "black") +
  labs(
    x = "Min_Educational_qualifications",
    y = "Number of Jobs",
    title = "Bar chart of Actuarial people by Min_Educational_qualifications"
  ) +
  coord_flip()

# Frequency of each Min_Educational_qualifications value among the rows of df
# whose Job_title_New is "SP".
dd1 <- df %>%
  filter(Job_title_New == "SP") %>%
  select(Min_Educational_qualifications)
d12 <- setNames(
  as.data.frame(table(dd1$Min_Educational_qualifications)),
  c("Min_Educational_qualifications", "Frequency")
)
# Use the reserved constant TRUE; `T` is an ordinary variable that can be
# reassigned elsewhere in the session.
d12 <- d12[order(d12$Frequency, decreasing = TRUE), ]

# Bars are ordered by reorder() inside aes(); coord_flip() makes them horizontal.
# The category axis is labelled with the variable actually plotted; it
# previously read "Programming_software", which did not match the mapped column.
ggplot(
  d12,
  aes(x = reorder(Min_Educational_qualifications, Frequency), y = Frequency)
) +
  geom_col(width = 0.5, fill = "#E7B800") +
  labs(
    x = "Min_Educational_qualifications",
    y = "Number of Jobs",
    title = "Bar chart of SP people by Min_Educational_qualifications"
  ) +
  coord_flip()

# Frequency of each Min_Educational_qualifications value among the rows of df
# whose Job_title_New is "Scientist".
dd1 <- df %>%
  filter(Job_title_New == "Scientist") %>%
  select(Min_Educational_qualifications)
d12 <- setNames(
  as.data.frame(table(dd1$Min_Educational_qualifications)),
  c("Min_Educational_qualifications", "Frequency")
)
# Use the reserved constant TRUE; `T` is an ordinary variable that can be
# reassigned elsewhere in the session.
d12 <- d12[order(d12$Frequency, decreasing = TRUE), ]

# Bars are ordered by reorder() inside aes(); coord_flip() makes them horizontal.
ggplot(
  d12,
  aes(x = reorder(Min_Educational_qualifications, Frequency), y = Frequency)
) +
  geom_col(width = 0.5, fill = "#00AFBB") +
  labs(
    x = "Min_Educational_qualifications",
    y = "Number of Jobs",
    title = "Bar chart of Scientist people by Min_Educational_qualifications"
  ) +
  coord_flip()

# Cross-tabulate Job_title_New against Salary_Currency and draw one stacked
# horizontal bar per job title.
d12 <- as.data.frame(table(df$Job_title_New, df$Salary_Currency))
names(d12) <- c("Job_title", "Salary_Currency", "No_of_cases")
# NOTE(review): label_ypos is a running total over the whole table (not reset
# per job title) and the plot below never reads it -- confirm it is still needed.
d12 <- d12 %>%
  mutate(label_ypos = cumsum(No_of_cases) - 0.5 * No_of_cases)
# Use the reserved constant TRUE; `T` is an ordinary variable that can be
# reassigned elsewhere in the session.
df13 <- d12[order(d12$No_of_cases, decreasing = TRUE), ]

df14 <- df13

ggplot(df14, aes(x = Job_title, y = No_of_cases, fill = Salary_Currency)) +
  geom_col(width = 0.5) +
  labs(
    x = "Job_title",
    y = "Number of Jobs",
    title = "Bar chart of people by Salary_Currency and Job title"
  ) +
  coord_flip()

# Cross-tabulate Job_title_New against Salary and draw one stacked horizontal
# bar per job title.
d12 <- as.data.frame(table(df$Job_title_New, df$Salary))
names(d12) <- c("Job_title", "Salary", "No_of_cases")
# NOTE(review): label_ypos is a running total over the whole table (not reset
# per job title) and the plot below never reads it -- confirm it is still needed.
d12 <- d12 %>%
  mutate(label_ypos = cumsum(No_of_cases) - 0.5 * No_of_cases)
# Use the reserved constant TRUE; `T` is an ordinary variable that can be
# reassigned elsewhere in the session.
df13 <- d12[order(d12$No_of_cases, decreasing = TRUE), ]

df14 <- df13

ggplot(df14, aes(x = Job_title, y = No_of_cases, fill = Salary)) +
  geom_col(width = 0.5) +
  labs(
    x = "Job_title",
    y = "Number of Jobs",
    title = "Bar chart of people by Salary and Job title"
  ) +
  coord_flip()

# Cross-tabulate Job_title_New against Country and draw one stacked horizontal
# bar per job title.
d12 <- as.data.frame(table(df$Job_title_New, df$Country))
names(d12) <- c("Job_title", "Country", "No_of_cases")
# NOTE(review): label_ypos is a running total over the whole table (not reset
# per job title) and the plot below never reads it -- confirm it is still needed.
d12 <- d12 %>%
  mutate(label_ypos = cumsum(No_of_cases) - 0.5 * No_of_cases)
# Use the reserved constant TRUE; `T` is an ordinary variable that can be
# reassigned elsewhere in the session.
df13 <- d12[order(d12$No_of_cases, decreasing = TRUE), ]

df14 <- df13

ggplot(df14, aes(x = Job_title, y = No_of_cases, fill = Country)) +
  geom_col(width = 0.5) +
  labs(
    x = "Job_title",
    y = "Number of Jobs",
    title = "Bar chart of people by Country and Job title"
  ) +
  coord_flip()

# Word cloud built from the terms found in df$URL.
text3 <- df$URL

# Wrap the vector in a tm corpus, then clean it step by step: collapse
# whitespace, strip punctuation, strip digits.
docs2 <- Corpus(VectorSource(text3))
# When this script was run, each of the three tm_map() calls emitted
# "Warning in tm_map.SimpleCorpus(...): transformation drops documents".
docs2 <- tm_map(docs2, stripWhitespace)
docs2 <- tm_map(docs2, removePunctuation)
docs2 <- tm_map(docs2, removeNumbers)

# Term-by-document counts, summed per term and ranked from most to least
# frequent.
dtm2 <- TermDocumentMatrix(docs2)
matrix2 <- as.matrix(dtm2)
words2 <- sort(rowSums(matrix2), decreasing = TRUE)

# wordcloud2() is given a data frame with one row per term and its frequency.
df2 <- data.frame(word = names(words2), freq = words2)

p <- wordcloud2(
  data = df2,
  size = 0.9,
  color = "random-dark",
  shape = "pentagon"
)
p